import requests
from bs4 import BeautifulSoup
from pydash import py_
import threading
import csv

# Host part of CSDN blog URLs; article links elsewhere are built from this.
CSDN_GAUSSDB_URL_HEAD = 'https://blog.csdn.net'
# API path returning a paginated JSON list of a user's blog posts.
ROOT_PATH = '/community/home-api/v1/get-business-list'

# Shared result accumulator: worker threads append one
# [title, description, url, process, tags] row per scraped article.
datasets = []

# Fetch an article page and extract its category ("process") and tags.
def get_tags_and_process(url):
    """Scrape one CSDN article page for its category and tag list.

    Args:
        url: Absolute URL of the article page.

    Returns:
        Tuple ``(process, tags_str)``: the article category (text of the
        first ``.tag-link`` element, or ``''`` when the page has none) and
        the remaining tag texts joined with a fullwidth '｜' separator.
    """
    # A browser-like user-agent is mandatory: CSDN's anti-scraping policy
    # returns nothing to clients without one.
    response = requests.get(url=url, headers={
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53'})
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")
    # All tag links of the article: the first is the category, the rest
    # are ordinary tags.
    tag_links = soup.select('.tags-box.artic-tag-box .tag-link')
    # Guard against articles with no tag links at all — the original
    # pydash `shift` of an empty list crashed here.
    process = tag_links[0].getText() if tag_links else ''
    tags_str = '｜'.join(link.getText() for link in tag_links[1:])
    return process, tags_str

# Thread worker function
def handler(items):
    """Scrape every article in ``items`` and record one row per article.

    Each item is expected to carry 'title', 'url' and 'description' keys
    (as returned by the CSDN list API). Results are appended to the shared
    module-level ``datasets`` list; ``list.append`` is atomic in CPython,
    so concurrent workers can share it safely.
    """
    print('** [Starting] ** ')
    total = len(items)  # hoisted: invariant inside the loop
    for index, item in enumerate(items, start=1):
        print('  >> Progress: ' + str(index) + '/' + str(total))
        process, tags = get_tags_and_process(item['url'])
        datasets.append([item['title'], item['description'], item['url'],
                         process, tags])

# Multi-threaded scraping
def mul_thread_handler(links):
    """Start one worker thread running ``handler(links)`` and return it.

    The caller is responsible for joining the returned thread. The
    single-element tuple ``(links,)`` makes the one-argument call explicit
    (the original ``args=([links])`` relied on list unpacking to do the
    same thing).
    """
    thread = threading.Thread(target=handler, args=(links,))
    thread.start()
    return thread

# HEAD and PATH probably need not be kept separate; left as-is for now.
url = [CSDN_GAUSSDB_URL_HEAD + ROOT_PATH]
# Current page number (1-based).
page = 1
# Articles requested per page.
PAGESIZE = 100
# Total article count; seeded with PAGESIZE so the first request happens,
# then overwritten with the real total reported by the API.
total = PAGESIZE
count = 0
all_items = []

# Page through the list endpoint until every article has been collected.
# Using `count < total` (not `<=`) avoids issuing one extra, empty request
# when the total is an exact multiple of PAGESIZE.
while count < total:
    response = requests.get(
        url=url[0],
        params={'page': page, 'size': PAGESIZE, 'year': '', 'month': '',
                'orderby': '', 'businessType': 'blog', 'noMore': 'true',
                'username': 'GaussDB'},
        headers={'Accept': 'application/json',
                 'accept-encoding': 'gzip, deflate, br',
                 'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53'})
    response.encoding = 'utf-8'
    # Parse the JSON body once per iteration (the original parsed it twice).
    data = response.json()['data']
    total = data['total']
    # Number of articles requested so far (may overshoot on the last page).
    count = page * PAGESIZE
    print(count)
    all_items += data['list']
    print(str(len(all_items)))
    page += 1

# Not the thread count itself: split the big list into chunks of 100 items;
# number of threads = len(all_items) / chunk_size.
chunk_size = 100
chunk = [all_items[i:i + chunk_size]
         for i in range(0, len(all_items), chunk_size)]

# Start one worker thread per chunk and keep the handles so we can wait
# for completion (plain loops instead of pydash side-effect maps).
threads = [mul_thread_handler(part) for part in chunk]

# Block until every worker has finished scraping its chunk.
for worker in threads:
    worker.join()

print('***************************** At last *****************************')


# Append the scraped rows to data/data.csv.
# Column order: title, content (description), link, process, tag —
# no header row is written, matching the existing file's format.
with open('../data/data.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(datasets)
